{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from getpass import getpass\n",
    "\n",
    "# Credentials come from the environment, with an interactive prompt as the\n",
    "# fallback, so that they are never stored inside the notebook file.\n",
    "PIXIV_USERNAME = os.environ.get(\"PIXIV_USERNAME\") or input(\"Pixiv username: \")\n",
    "PIXIV_PASSWORD = os.environ.get(\"PIXIV_PASSWORD\") or getpass(\"Pixiv password: \")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pixivpy3 import AppPixivAPI\n",
    "\n",
    "\n",
    "api = AppPixivAPI()\n",
    "# To bypass the GFW, use ByPassSniApi (also from pixivpy3) instead:\n",
    "# api = ByPassSniApi()\n",
    "# api.require_appapi_hosts()\n",
    "api.set_accept_language('zh-cn')  # have tags translated into Chinese\n",
    "\n",
    "# Log in and keep the id of the logged-in account for the crawl cells below.\n",
    "# NOTE(review): newer pixivpy releases appear to authenticate with\n",
    "# api.auth(refresh_token=...) -- confirm login() works with the installed version.\n",
    "token = api.login(PIXIV_USERNAME, PIXIV_PASSWORD)\n",
    "user_id = token.response.user.id\n",
    "print(token.response.user)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PixivCrawler (with pixivpy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import time\n",
    "import random\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sqlite3 as lite\n",
    "from sqlalchemy import create_engine\n",
    "try:\n",
    "    from tqdm.notebook import tqdm  # location in newer tqdm releases\n",
    "except ImportError:\n",
    "    # Fall back to the older notebook progress bar; any other error propagates.\n",
    "    from tqdm import tqdm_notebook as tqdm\n",
    "\n",
    "class PixivCrawler(object):\n",
    "\n",
    "    def __init__(self, api, illust_db='pixiv_illusts.db'):\n",
    "        self.api = api\n",
    "        self.illust_db = illust_db\n",
    "        self.user_info = None\n",
    "\n",
    "    def randSleep(self, base=0.1, rand=0.5):\n",
    "        \"休眠随机的时间\"\n",
    "        time.sleep(base + rand*random.random())\n",
    "\n",
    "    def GetUserDetail(self, user_id):\n",
    "        \"查询指定用户的基本信息\"\n",
    "        self.last_user = self.api.user_detail(user_id)\n",
    "        return self.last_user\n",
    "\n",
    "    def GetUserBookmarks(self, user_id, restrict='public'):\n",
    "        \"获取指定用户的收藏列表\"\n",
    "        df_list = []\n",
    "        next_qs = {'user_id': user_id, 'restrict': restrict}\n",
    "\n",
    "        user = self.GetUserDetail(user_id)\n",
    "        self.randSleep(0.1)\n",
    "\n",
    "        with tqdm(total=user.profile.total_illust_bookmarks_public,\n",
    "                  desc=\"api.user_bookmarks_illust\") as pbar:\n",
    "            while next_qs != None:\n",
    "                json_result = self.api.user_bookmarks_illust(**next_qs)\n",
    "                tmp_df = pd.DataFrame.from_dict(json_result.illusts)\n",
    "                df_list.append(tmp_df)\n",
    "                pbar.update(tmp_df.shape[0])\n",
    "                next_qs = self.api.parse_qs(json_result.next_url)\n",
    "                self.randSleep(0.1)\n",
    "\n",
    "        df = pd.concat(df_list).rename(columns={'id': 'illust_id'})\n",
    "        df['user_id'] = df.user.apply(lambda d: d['id'])\n",
    "        return df.set_index('illust_id')\n",
    "\n",
    "    def GetUserIllusts(self, user_id, type='illust'):\n",
    "        \"获取指定用户的作品列表(illusts/manga)\"\n",
    "        df_list = []\n",
    "        next_qs = {'user_id': user_id, 'type': type, 'filter': 'for_ios'}\n",
    "\n",
    "        user = self.GetUserDetail(user_id)\n",
    "        if type == 'illust':\n",
    "            total = user.profile.total_illusts\n",
    "        elif type == 'manga':\n",
    "            total = user.profile.total_manga\n",
    "        else:\n",
    "            raise Exception(\"Unsupported type=%d\" % type)\n",
    "        self.randSleep(0.1)\n",
    "\n",
    "        with tqdm(total=total, desc=\"api.user_illusts\") as pbar:\n",
    "            while next_qs != None:\n",
    "                json_result = self.api.user_illusts(**next_qs)\n",
    "                tmp_df = pd.DataFrame.from_dict(json_result.illusts)\n",
    "                df_list.append(tmp_df)\n",
    "                pbar.update(tmp_df.shape[0])\n",
    "                next_qs = self.api.parse_qs(json_result.next_url)\n",
    "                self.randSleep(0.1)\n",
    "\n",
    "        df = pd.concat(df_list).rename(columns={'id': 'illust_id'})\n",
    "        df['user_id'] = df.user.apply(lambda d: d['id'])\n",
    "        return df.set_index('illust_id')\n",
    "\n",
    "    def GetIllustRanking(self, mode, date, total=100):\n",
    "        \"获取作品排行榜\"\n",
    "        df_list = []\n",
    "        next_qs = {'mode': mode, 'date': date, 'filter': 'for_ios'}\n",
    "\n",
    "        with tqdm(total=total, desc=\"api.illust_ranking\") as pbar:\n",
    "            while next_qs != None:\n",
    "                json_result = self.api.illust_ranking(**next_qs)\n",
    "                tmp_df = pd.DataFrame.from_dict(json_result.illusts)\n",
    "                df_list.append(tmp_df)\n",
    "                pbar.update(tmp_df.shape[0])\n",
    "                next_qs = self.api.parse_qs(json_result.next_url)\n",
    "                self.randSleep(0.3)\n",
    "\n",
    "        df = pd.concat(df_list).rename(columns={'id': 'illust_id'})\n",
    "        df['user_id'] = df.user.apply(lambda d: d['id'])\n",
    "        return df.set_index('illust_id')\n",
    "\n",
    "    def GetFollowingUsers(self, user_id, restrict='public'):\n",
    "        \"获取指定用户跟踪的用户列表，返回user_ids\"\n",
    "        user_ids = []\n",
    "        next_qs = {'user_id': user_id, 'restrict': restrict}\n",
    "\n",
    "        user = self.GetUserDetail(user_id)\n",
    "        with tqdm(total=user.profile.total_follow_users,\n",
    "                  desc=\"api.user_following\") as pbar:\n",
    "            while next_qs != None:\n",
    "                json_result = self.api.user_following(**next_qs)\n",
    "                for one_user in json_result.user_previews:\n",
    "                    user_ids.append(one_user.user.id)\n",
    "                pbar.update(len(json_result.user_previews))\n",
    "                next_qs = self.api.parse_qs(json_result.next_url)\n",
    "                self.randSleep(0.3, 0.8)\n",
    "        return np.array(user_ids)\n",
    "\n",
    "    def UpdateIllusts(self, df_illusts):\n",
    "        sql_df = df_illusts.copy()\n",
    "\n",
    "        # 数组类字段转json\n",
    "        sql_df['image_urls'] = sql_df.image_urls.apply(json.dumps)\n",
    "        sql_df['meta_pages'] = sql_df.meta_pages.apply(json.dumps)\n",
    "        sql_df['meta_single_page'] = sql_df.meta_single_page.apply(json.dumps)\n",
    "        sql_df['series'] = sql_df.series.apply(json.dumps)\n",
    "        sql_df['tags'] = sql_df.tags.apply(json.dumps)\n",
    "        sql_df['tools'] = sql_df.tools.apply(json.dumps)\n",
    "        sql_df['user'] = sql_df.user.apply(json.dumps)\n",
    "\n",
    "        # 先读取文件里的illusts存储，并用新的数据代替key相同的内容\n",
    "        if os.path.isfile(self.illust_db):\n",
    "            # 读取文件的数据并丢弃同样的illust_id (保留新的illust_id)\n",
    "            db_df = self.DBIllusts(ensure_json=False)\n",
    "            db_df = db_df[~db_df.index.isin(sql_df.index)]\n",
    "            merged_df = pd.concat([sql_df, db_df], sort=False)\n",
    "        else:\n",
    "            merged_df = sql_df\n",
    "\n",
    "        # 合并后df写入文件(replace方式)\n",
    "        engine = create_engine('sqlite:///' + self.illust_db, echo=False)\n",
    "        merged_df.to_sql('illusts', con=engine, if_exists='replace')\n",
    "        return merged_df\n",
    "\n",
    "    def DBIllusts(self, sql=\"SELECT * FROM illusts WHERE illust_id > 0\", ensure_json=True):\n",
    "        with lite.connect(self.illust_db) as conn:\n",
    "            sql_df = pd.read_sql_query(sql, conn, index_col='illust_id')\n",
    "\n",
    "        # 还原json字段\n",
    "        if ensure_json:\n",
    "            sql_df['image_urls'] = sql_df.image_urls.apply(json.loads)\n",
    "            sql_df['meta_pages'] = sql_df.meta_pages.apply(json.loads)\n",
    "            sql_df['meta_single_page'] = sql_df.meta_single_page.apply(\n",
    "                json.loads)\n",
    "            sql_df['series'] = sql_df.series.apply(json.loads)\n",
    "            sql_df['tags'] = sql_df.tags.apply(json.loads)\n",
    "            sql_df['tools'] = sql_df.tools.apply(json.loads)\n",
    "            sql_df['user'] = sql_df.user.apply(json.loads)\n",
    "        return sql_df\n",
    "\n",
    "\n",
    "# Crawler instance used by the cells below; it talks to Pixiv through `api`.\n",
    "crawl = PixivCrawler(api=api)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## GetUserBookmarks(public)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the public bookmarks of the logged-in user, then merge them into the\n",
    "# local database. The merged frame is bound to `_` so the cell shows no output.\n",
    "df_bookmarks = crawl.GetUserBookmarks(user_id=user_id, restrict='public')\n",
    "_ = crawl.UpdateIllusts(df_illusts=df_bookmarks)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## GetFollowingUsers(public)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ids of the users that the logged-in user follows publicly.\n",
    "user_ids = crawl.GetFollowingUsers(user_id=user_id, restrict='public')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visit the followed users in random order. The order is drawn into a new\n",
    "# array, so `user_ids` from the previous cell is left unmodified.\n",
    "shuffled_user_ids = np.random.permutation(user_ids)\n",
    "for uid in tqdm(shuffled_user_ids, desc=\"GetFollowingUsers\"):\n",
    "    df = crawl.GetUserIllusts(uid)\n",
    "    _ = crawl.UpdateIllusts(df)\n",
    "    crawl.randSleep(1.1, 5.0)  # longer pause between users than between pages"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## GetIllustRanking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reference for the arguments of GetIllustRanking:\n",
    "#   mode:        [day, week, month, day_male, day_female, week_original, week_rookie, day_manga]\n",
    "#   date:        e.g. '2016-08-01'\n",
    "#   mode (Past): [day, week, month, day_male, day_female, week_original, week_rookie,\n",
    "#                 day_r18, day_male_r18, day_female_r18, week_r18, week_r18g]\n",
    "df_ranking = crawl.GetIllustRanking(mode='week', date='2019-11-01')\n",
    "# Merge the ranking into the local database; `_` keeps the cell output empty.\n",
    "_ = crawl.UpdateIllusts(df_illusts=df_ranking)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
